import numpy as np
import pandas as pd
import pydotplus
import seaborn as sn
from IPython.display import Image
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

# sklearn.cross_validation was removed in scikit-learn 0.20; its functions
# now live in sklearn.model_selection. The old location is kept as a fallback.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# sklearn.externals.six was removed in scikit-learn 0.23; on Python 3 its
# StringIO is the standard-library io.StringIO.
try:
    from sklearn.externals.six import StringIO
except ImportError:
    from io import StringIO
%matplotlib inline
# Load the wine-quality table; the file uses ';' as its field separator.
dataset = pd.read_csv(
    filepath_or_buffer="D:\\Data Science\\Semester 2\\Python & ML data sets\\redWineQuality.csv",
    sep=';',
)
# Preview of the first rows (presumably a notebook cell output; the value is
# discarded when this runs as a plain script).
dataset.head()
print('Function to detect outliers')
def iqr(data):
    """Replace IQR outliers in *data* with the median of *data*.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where Q1 and Q3
    are the 25th and 75th percentiles of the input. Values strictly inside
    the fences are kept; every other value (including one lying exactly on a
    fence) is replaced by the median of the unmodified input.

    Args:
        data: Numeric array-like (list, ndarray, pandas Series, ...).

    Returns:
        numpy.ndarray: Same shape as the input, with outliers replaced.
    """
    # Accept plain sequences too; comparisons below need array semantics.
    values = np.asarray(data)
    qtr1, qtr3 = np.percentile(values, [25, 75])
    # Named ``spread`` so the function's own name is not shadowed.
    spread = qtr3 - qtr1
    lower_b = qtr1 - (spread * 1.5)
    upper_b = qtr3 + (spread * 1.5)
    inside_fences = np.logical_and(values > lower_b, values < upper_b)
    return np.where(inside_fences, values, np.median(values))
print('Using the above function for all the features')
# Run the outlier replacement over each feature column in turn; the
# 'quality' target column is left untouched.
for feature_name in (
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
):
    dataset[feature_name] = iqr(dataset[feature_name])
# Summary statistics of the first 11 columns after cleaning.
feature_summary = dataset.iloc[:, 0:11].describe()
print(feature_summary)
# Distinct target values present in the data.
print(dataset['quality'].unique())
# Columns 0-10 are the features, column 11 is the target.
X = dataset.iloc[:, 0:11].values
y = dataset.iloc[:, 11].values
# Split BEFORE scaling. The row assignment depends only on the number of rows
# and random_state, so the same rows land in each split as before.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
# Fit the scaler on the training rows only, so the held-out rows do not
# influence the min/max used for scaling (avoids test-set leakage), then
# apply that same transformation everywhere.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Keep X as a scaled matrix for any later code that still reads it.
X = scaler.transform(X)
# random_state pins the tree's internal feature shuffling so repeated runs
# give the same tree, consistent with the seeded split and forest.
DTC = DecisionTreeClassifier(criterion='gini', max_depth=13, random_state=0)
DTC.fit(X_train, y_train)
# Write the fitted tree as Graphviz DOT source into an in-memory text buffer.
dot_data = StringIO()
export_graphviz(
    DTC,
    out_file=dot_data,
    special_characters=True,
    rounded=True,
    filled=True,
)
# Parse the DOT text into a pydotplus graph object.
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
import os
import sys
# NOTE(review): opens a new 10x10 matplotlib figure, but no visible code draws
# on it before the next plt.figure call — confirm whether it is still needed.
plt.figure(figsize=(10,10))
def conda_fix(graph):
    """Tell *graph* where the Graphviz executables live.

    Builds ``<sys.base_exec_prefix>/Library/bin/graphviz/<tool>.exe`` for each
    of dot, twopi, neato, circo and fdp, and hands the resulting mapping to
    ``graph.set_graphviz_executables``. The ``.exe`` suffix and directory
    layout presumably target a conda install on Windows — confirm before
    relying on this on other platforms.

    Args:
        graph: Object exposing ``set_graphviz_executables(mapping)``; in this
            file it is the graph returned by pydotplus.

    Returns:
        None. *graph* is configured in place.
    """
    graphviz_dir = os.path.join(sys.base_exec_prefix, "Library", "bin", "graphviz")
    tools = ("dot", "twopi", "neato", "circo", "fdp")
    executables = {
        tool: os.path.join(graphviz_dir, "{}.exe".format(tool)) for tool in tools
    }
    graph.set_graphviz_executables(executables)
# Point the pydotplus graph at the Graphviz binaries, then render it as JPEG
# bytes wrapped in an IPython Image.
conda_fix(graph)
Image(graph.create_jpeg())
# NOTE(review): presumably the launch command to use when the notebook hits
# the IOPub data-rate limit while displaying the image — confirm:
#jupyter notebook --NotebookApp.iopub_data_rate_limit=100000000
# Evaluate the decision tree on the held-out split.
pred = DTC.predict(X_test)
plt.figure(figsize=(8, 5))
# Label the heatmap with the actual class values instead of positional
# indices 0..k-1. The sorted union of true and predicted classes is the same
# label set and order confusion_matrix uses by default, so the counts are
# unchanged. Rows are true classes, columns are predicted classes.
class_labels = np.union1d(y_test, pred)
sn.heatmap(
    pd.DataFrame(
        confusion_matrix(y_test, pred, labels=class_labels),
        index=class_labels,
        columns=class_labels,
    ),
    annot=True,
    fmt='d',
)
# Accuracy on the training split, then on the held-out split.
print(DTC.score(X_train, y_train), DTC.score(X_test, y_test))
# Random forest with a fixed seed so results are repeatable.
RFC = RandomForestClassifier(n_estimators=11, random_state=0, max_depth=13)
RFC.fit(X_train, y_train)
# Accuracy on the training split, then on the held-out split.
print(RFC.score(X_train, y_train), RFC.score(X_test, y_test))
# Cross-validate on the training split. cross_val_score refits clones of the
# estimator on each fold, so running it on the small held-out split would
# score models trained on a fraction of the test rows and would reuse the
# hold-out data for model assessment.
print(np.mean(cross_val_score(RFC, X_train, y_train, verbose=1, cv=9)))
# Support-vector classifier with hand-picked C and gamma.
svc = SVC(C=90, gamma=0.1)
svc.fit(X_train, y_train)
# Accuracy on the training split, then on the held-out split.
print(svc.score(X_train, y_train), svc.score(X_test, y_test))
# Cross-validate on the training split. cross_val_score refits clones of the
# estimator on each fold, so running it on the small held-out split would
# score models trained on a fraction of the test rows and would reuse the
# hold-out data for model assessment.
print(np.mean(cross_val_score(svc, X_train, y_train, verbose=1, cv=9)))